##FKF

This template is a simple way to jump right into your analysis. The libraries included are most of my used libraries and will get you started with tidy data, ggplot, and more!

library(tidyr)
library(ggplot2)
library(ggimage)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readr)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(stringr)
library(forcats)
library(quanteda)
## Package version: 4.1.0
## Unicode version: 15.1
## ICU version: 74.1
## Parallel computing: 12 of 12 threads used.
## See https://quanteda.io for tutorials and examples.
library("quanteda.textplots")
library("quanteda.textstats")
library(extrafont)
## Registering fonts with R
library(wordcloud)
## Loading required package: RColorBrewer
library(tidytext)
library(knitr)
## Warning: package 'knitr' was built under R version 4.4.2

The data used in this data set was pulled from the IMDb website with a web scraper I created using python. I looked up B-Horror movies genre by the most popular and pulled the top 100 titles at the time. However, popularity changes over time and thus so will the titles that get pulled from the web scarper. Therefore popularity is not focused on in this analysis.

Let’s take a look at what was pulled.

horror <- read_csv('spookySZN_clean.csv')
## New names:
## Rows: 100 Columns: 13
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (9): Movie, Director, Writer1, Writer2, Star1, Star2, MovieRating, RunTi... dbl
## (3): ...1, Rating, Released num (1): PopularityScore
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
horror
## # A tibble: 100 × 13
##     ...1 Movie  Rating Released Director Writer1 Writer2 Star1 Star2 MovieRating
##    <dbl> <chr>   <dbl>    <dbl> <chr>    <chr>   <chr>   <chr> <chr> <chr>      
##  1     0 Hallo…    7.7     1978 John Ca… John C… Debra … Dona… Jami… R          
##  2     1 The T…    7.4     1974 Tobe Ho… Kim He… Tobe H… Mari… Edwi… R          
##  3     2 The E…    7.4     1981 Sam Rai… Sam Ra… <NA>    Bruc… Elle… NC-17      
##  4     3 Tremo…    7.2     1990 Ron Und… S.S. W… Brent … Kevi… Fred… PG-13      
##  5     4 Frida…    6.4     1980 Sean S.… Victor… Ron Ku… Bets… Adri… R          
##  6     5 The B…    6.5     1999 Daniel … Daniel… Eduard… Heat… Mich… R          
##  7     6 Sleep…    6.2     1983 Robert … Robert… <NA>    Feli… Jona… R          
##  8     7 Kille…    6.2     1988 Stephen… Charle… Stephe… Gran… Suza… PG-13      
##  9     8 Night…    7.8     1968 George … John A… George… Duan… Judi… R          
## 10     9 Frida…    6.1     1981 Steve M… Ron Ku… Victor… Bets… Amy … R          
## # ℹ 90 more rows
## # ℹ 3 more variables: RunTime <chr>, Synopsis <chr>, PopularityScore <dbl>

This section focuses on some manipulation I did to the data frame to help with my analysis.

#Want to change the data type of runTime to runTime in minutes. 
#First I will remove the 1h from the character string. 
#I will have to save this into our data frame before trying to parse out "m"
horror <- horror %>% 
  mutate(runTimeMins =str_remove(horror$RunTime, "1h "))

#parse out the "m" character
horror <- horror %>% 
  mutate(runTimeMins = str_remove(horror$runTimeMins, "m")) %>% 
  #change the datatype from chr to numeric and add 60 
  #since this is a small data set I can clearly see that all are at least an hour long and less than 2 hours
  mutate(runTimeMins = as.numeric(runTimeMins) + 60) %>%
  #Lets take a look at this by the decade
  #created a decade column to group the releases of these movies
  mutate(Decade = Released %/% 10 * 10 )


#This new dataframe will be helpful for our text analysis of our Synopsis later on
tidyHoror <- horror %>% 
  #we want to create a row for each word shown in the Synopsis variable
  unnest_tokens(word, Synopsis) %>% 
  #Lets remove stop words that will provide little to no importance
  anti_join(stop_words)
## Joining with `by = join_by(word)`

The following section focuses on summary statistics.

horror %>% 
  summary()
##       ...1          Movie               Rating         Released   
##  Min.   : 0.00   Length:100         Min.   :1.500   Min.   :1957  
##  1st Qu.:24.75   Class :character   1st Qu.:3.875   1st Qu.:1978  
##  Median :49.50   Mode  :character   Median :5.100   Median :1984  
##  Mean   :49.50                      Mean   :4.866   Mean   :1990  
##  3rd Qu.:74.25                      3rd Qu.:5.800   3rd Qu.:2010  
##  Max.   :99.00                      Max.   :7.800   Max.   :2024  
##                                                                   
##    Director           Writer1            Writer2             Star1          
##  Length:100         Length:100         Length:100         Length:100        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##     Star2           MovieRating          RunTime            Synopsis        
##  Length:100         Length:100         Length:100         Length:100        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  PopularityScore  runTimeMins        Decade    
##  Min.   : 288    Min.   : 63.0   Min.   :1950  
##  1st Qu.: 787    1st Qu.: 83.0   1st Qu.:1970  
##  Median :1935    Median : 89.0   Median :1980  
##  Mean   :2280    Mean   : 88.2   Mean   :1986  
##  3rd Qu.:3921    3rd Qu.: 95.0   3rd Qu.:2010  
##  Max.   :4864    Max.   :112.0   Max.   :2020  
##  NA's   :83
#popularity score is not helpful because of how many do not have one.
#Many titles do not have a popularity score due to so many B-Horror movies being niche titles

#Let's see the distribution of ratings and the run times (in minutes) for the titles we pulled

horror %>%  
  ggplot(aes(x = Rating)) +
  geom_histogram(binwidth = .5)

horror %>%  
  ggplot(aes(x = Rating)) +
  geom_density()

horror %>% 
  ggplot() + 
  geom_histogram(aes(x = runTimeMins), , binwidth = 5)

horror %>% 
  ggplot() + 
  geom_density(aes(x = runTimeMins))

I thought it would be interesting to pull directors writers and stars. So we do some basic counts to see if we can identify creative professionals that continuously appear in different titles.

# Most directors only have one film in this list, but a select few have 2 or more.

horror %>% 
  count(Director) %>%
  arrange(desc(n))
## # A tibble: 88 × 2
##    Director                   n
##    <chr>                  <int>
##  1 Herschell Gordon Lewis     3
##  2 Wes Craven                 3
##  3 Andrew Jones               2
##  4 Bruno Mattei               2
##  5 Edward D. Wood Jr.         2
##  6 James Nguyen               2
##  7 Mark Polonia               2
##  8 Roger Corman               2
##  9 Tobe Hooper                2
## 10 Umberto Lenzi              2
## # ℹ 78 more rows
#How many times do actors appear in these movies? 
#Most are unique actors in these movies, however, a few show up in 2 titles. 

horror %>% 
  #Since we have 2 columns of Stars (Star1 and Star2) Let's make this data frame longer to include them all in one column 
  pivot_longer(Star1:Star2, names_to = "Star", values_to = "Actor") %>% 
  count(Actor) %>% 
  arrange(desc(n))
## # A tibble: 195 × 2
##    Actor              n
##    <chr>          <int>
##  1 Alan Bagh          2
##  2 Betsy Palmer       2
##  3 Robert Houston     2
##  4 Whitney Moore      2
##  5 William Kerwin     2
##  6 Adan Canto         1
##  7 Adrian Zmed        1
##  8 Adrienne King      1
##  9 Alicia Sanz        1
## 10 Allison Hayes      1
## # ℹ 185 more rows
#How many times does a writer appear in this data set? 
#The most prevalent writer in this data set is Wes Craven, unsurprisingly!

horror %>% 
  pivot_longer(Writer1:Writer2, names_to = "Writer", values_to = "Names") %>% 
  #remove some missing values because IMDB does not always specify a second writer, or there is none. 
  filter(!is.na(Names)) %>% 
  count(Names) %>% 
  arrange(desc(n))
## # A tibble: 155 × 2
##    Names                  n
##    <chr>              <int>
##  1 Wes Craven             3
##  2 Claudio Fragasso       2
##  3 Edward D. Wood Jr.     2
##  4 James Nguyen           2
##  5 John A. Russo          2
##  6 Ron Kurz               2
##  7 Victor Miller          2
##  8 Adam Simon             1
##  9 Al Ramrus              1
## 10 Albert Lawrence        1
## # ℹ 145 more rows

We focus on single variables again to see the top movies in this data set and how many titles were released per year.

#What are the highest rated movies? 

#Top 10 B-horror movies on IMDB based on IMDB rating

horror %>%
  arrange(desc(Rating)) %>% 
  select(Movie, MovieRating, Rating ) %>%
  head(10)
## # A tibble: 10 × 3
##    Movie                        MovieRating Rating
##    <chr>                        <chr>        <dbl>
##  1 Night of the Living Dead     R              7.8
##  2 Halloween                    R              7.7
##  3 The Texas Chain Saw Massacre R              7.4
##  4 The Evil Dead                NC-17          7.4
##  5 Curse of the Demon           Approved       7.4
##  6 Tremors                      PG-13          7.2
##  7 Carnival of Souls            PG             7  
##  8 White Dog                    PG             7  
##  9 The Blair Witch Project      R              6.5
## 10 The Raven                    G              6.5
#How many releases are there by year? 

#The year with the most B-Horror titles released was 1981

horror %>% 
  count(Released) %>% 
  arrange(desc(n))
## # A tibble: 48 × 2
##    Released     n
##       <dbl> <int>
##  1     1981     9
##  2     1980     6
##  3     1984     6
##  4     2013     4
##  5     2022     4
##  6     1982     3
##  7     1983     3
##  8     1987     3
##  9     1990     3
## 10     1993     3
## # ℹ 38 more rows
#These are all of the titles released in 1981 from this data

horror %>% 
  filter(Released == 1981)
## # A tibble: 9 × 15
##    ...1 Movie   Rating Released Director Writer1 Writer2 Star1 Star2 MovieRating
##   <dbl> <chr>    <dbl>    <dbl> <chr>    <chr>   <chr>   <chr> <chr> <chr>      
## 1     2 The Ev…    7.4     1981 Sam Rai… Sam Ra… <NA>    Bruc… Elle… NC-17      
## 2     9 Friday…    6.1     1981 Steve M… Ron Ku… Victor… Bets… Amy … R          
## 3    13 The Bu…    6.3     1981 Tony Ma… Harvey… Tony M… Bria… Leah… R          
## 4    20 Happy …    6       1981 J. Lee … John C… Peter … Meli… Glen… R          
## 5    22 Saturd…    4.6     1981 Howard … Jeff B… Howard… Rich… Paul… PG         
## 6    33 Cannib…    5.1     1981 Umberto… Umbert… <NA>    Giov… Lorr… <NA>       
## 7    39 Madman     5.1     1981 Joe Gia… Joe Gi… Gary S… Gayl… Tony… R          
## 8    43 Absurd     5.3     1981 Joe D'A… George… <NA>    Geor… Anni… Not Rated  
## 9    67 Final …    4.6     1981 Jimmy H… Jimmy … <NA>    Ceci… Joel… R          
## # ℹ 5 more variables: RunTime <chr>, Synopsis <chr>, PopularityScore <dbl>,
## #   runTimeMins <dbl>, Decade <dbl>
#How many releases are there by decade? 

#Most of these B-Horror movies were released in the 80's
#We see sort of a "resurgence" of these movies in the 2010's

horror %>% 
  count(Decade) %>% 
  arrange(desc(n))
## # A tibble: 8 × 2
##   Decade     n
##    <dbl> <int>
## 1   1980    32
## 2   2010    17
## 3   1970    11
## 4   1990    11
## 5   1960     9
## 6   2020     9
## 7   1950     6
## 8   2000     5
#there was an increase in releases from the 1960's peaking in the 1980s
#Followed by a decline in releases until the 2010s. 

plot_decade <- horror %>% 
  count(Decade) %>% 
  arrange(desc(n)) %>% 
  ggplot(aes(x = Decade, y = n)) + 
  geom_line(color = "red") +
  geom_label(aes(x = Decade, y = n, label = n, vjust = .5),) + 
  theme(text = element_text(family = "Nosifer", color = "white"),
        panel.background = element_rect(fill = "black"), 
        panel.grid = element_blank(), 
        axis.line = element_line(color = "white"), 
        axis.text.y = (element_text(color = "white")), 
        axis.text.x = element_text(color = "white")) +
  labs(y = "Count of Movies", 
       title = "Number TOP B-Horror Movies by Decade", 
       subtitle = "From IMDB's 100 most popular B-Horror movies",
       caption = "Source - imdb.com\nImage-MGM")

ggbackground(plot_decade, "images/poltergeist.jpeg")

The number of releases around the 1980’s is interesting in this data set. Lets dig a little deeper.

#Have B-Horror movies gotten better, worse,or stayed the same over time? 

ratingPlot <- horror %>%
  select(Movie, Rating, Released) %>% 
  ggplot(aes(x = Released, y = Rating)) + 
  geom_line(color = "white") +
  geom_hline(aes(yintercept = mean(Rating)), color = "orange4") +
  theme(text = element_text(family = "Nosifer", color = "red"),
    axis.text.x = element_text(angle = 45, vjust = .75, hjust = .80, size = 10, color = "white"),
    axis.title.x = element_blank(),
    axis.ticks.x = element_line(color = "white"),
    axis.text.y = (element_text(color = "white")),
    panel.grid = element_blank())+
  labs(title = "B-Horror Movie Ratings Throughout the Years", 
       caption = "Data Source - imdb.com\nImage - evil.fandom.com/wiki/Crystal_Lake")

ggbackground(ratingPlot, "images/The_Crystal_Lake.jpeg")

#The late 1960s through the 1980s experienced above average ratings of movies 
#Afterwards we see a loss in consistency and have a lot of volatility

plot <- horror %>% 
  arrange(desc(Rating)) %>% 
  filter(Decade >= 1960, Decade <= 1980) %>% 
  mutate(Movie = as.factor(Movie)) %>% 
  mutate(Movie = fct_reorder(Movie, Rating, .fun = sum, .desc = TRUE)) %>% 
  head(10) %>% 
  ggplot(aes(x = Movie, y = Rating)) +
  geom_col(aes(x = Movie, y = Rating), fill = "darkorange",  alpha = .66) +
 coord_cartesian(ylim=c(6,8.5))+
  geom_label(aes(x = Movie, y = Rating, label = Released)) + 
  theme(text = element_text(family = "Nosifer", color = "red"),
    axis.text.x = element_text(angle = 45, vjust = .75, hjust = .80, size = 6, color = "white"),
    axis.title.x = element_blank(),
    axis.text.y = (element_text(color = "white")),
    plot.background = element_rect(fill = "black"), 
    panel.grid = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    strip.background = element_blank()) +
  labs(y = "IMDB Ratings", 
       title = "The Golden Age of B-Horror", 
       subtitle = "Highest rated B-Horror Movies from the 1960's - 1980's",
       caption = "Data Source - imdb.com\n Image - evildead.fandom.com/wiki/The_Knowby_Cabin") 

ggbackground(plot, "images/CabinExterior-EvilDead2.jpeg")

Out of curiosity, I wanted to know how run times have changed over time.

#have movies gotten longer over time? 
#Movies seem to have gotten longer after the 1960's, but have stayed relatively the same length since the 1980's


horror %>% 
  ggplot(aes(x = Released, y = runTimeMins)) +
  geom_point() + 
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

The remaining sections area all about text analysis. I haven’t tried my hand at text mining / text analysis so we will be digging into the user submitted synopsis for the 100 titles we pulled!

This sections is all about preprocessing using the quanteda package.

horror
## # A tibble: 100 × 15
##     ...1 Movie  Rating Released Director Writer1 Writer2 Star1 Star2 MovieRating
##    <dbl> <chr>   <dbl>    <dbl> <chr>    <chr>   <chr>   <chr> <chr> <chr>      
##  1     0 Hallo…    7.7     1978 John Ca… John C… Debra … Dona… Jami… R          
##  2     1 The T…    7.4     1974 Tobe Ho… Kim He… Tobe H… Mari… Edwi… R          
##  3     2 The E…    7.4     1981 Sam Rai… Sam Ra… <NA>    Bruc… Elle… NC-17      
##  4     3 Tremo…    7.2     1990 Ron Und… S.S. W… Brent … Kevi… Fred… PG-13      
##  5     4 Frida…    6.4     1980 Sean S.… Victor… Ron Ku… Bets… Adri… R          
##  6     5 The B…    6.5     1999 Daniel … Daniel… Eduard… Heat… Mich… R          
##  7     6 Sleep…    6.2     1983 Robert … Robert… <NA>    Feli… Jona… R          
##  8     7 Kille…    6.2     1988 Stephen… Charle… Stephe… Gran… Suza… PG-13      
##  9     8 Night…    7.8     1968 George … John A… George… Duan… Judi… R          
## 10     9 Frida…    6.1     1981 Steve M… Ron Ku… Victor… Bets… Amy … R          
## # ℹ 90 more rows
## # ℹ 5 more variables: RunTime <chr>, Synopsis <chr>, PopularityScore <dbl>,
## #   runTimeMins <dbl>, Decade <dbl>
topHorrorList <- horror %>% 
  arrange(desc(Rating)) %>% 
  select(Movie) %>% 
  head(10) %>% 
  as.vector()

topHorrorList
## $Movie
##  [1] "Night of the Living Dead"     "Halloween"                   
##  [3] "The Texas Chain Saw Massacre" "The Evil Dead"               
##  [5] "Curse of the Demon"           "Tremors"                     
##  [7] "Carnival of Souls"            "White Dog"                   
##  [9] "The Blair Witch Project"      "The Raven"
is.vector(topHorrorList)
## [1] TRUE
topHorrorList2 <- c("Halloween", "Night of the Living Dead", "The Texas Chain Saw Massacre", 
                   "The Evil Dead", "Curse of the Demon", "Tremors", "Carnival of Souls", "White Dog", 
                   "The Blair Witch Project", "The Raven")

is.vector(topHorrorList2)
## [1] TRUE
horrorcorp = corpus(horror,text_field = "Synopsis")

dtm <- dfm(tokens(horrorcorp, remove_punct = TRUE, 
                  remove_symbols = TRUE, 
                  remove_separators = TRUE, remove_numbers = TRUE), tolower = TRUE) %>%
  dfm_remove(stopwords('en')) %>% 
  dfm_trim(min_termfreq = 8) 

dtm
## Document-feature matrix of: 100 documents, 50 features (90.58% sparse) and 14 docvars.
##        features
## docs    night police death years one find dr back time town
##   text1     2      1     1     1   1    1  1    1    1    1
##   text2     0      0     0     0   0    0  0    0    0    0
##   text3     0      0     0     0   3    1  0    0    1    0
##   text4     0      0     0     0   2    0  0    0    1    2
##   text5     0      0     0     0   2    0  0    0    0    0
##   text6     0      0     0     0   1    1  0    0    0    0
## [ reached max_ndoc ... 94 more documents, reached max_nfeat ... 40 more features ]
horror2corp = horror %>% 
  arrange(desc(Rating)) %>% 
  filter(Decade >= 1960, Decade <= 1980) %>% 
  corpus(text_field = "Synopsis")

dtm2 <- dfm(tokens(horror2corp, remove_punct = TRUE, 
                  remove_symbols = TRUE, 
                  remove_separators = TRUE, remove_numbers = TRUE), tolower = TRUE) %>%
  dfm_remove(stopwords('en')) %>% 
  dfm_trim(min_termfreq = 6)


horror_dfm <- tidyHoror %>%  
  filter(Movie %in% topHorrorList2) %>%
  count(Movie, word, sort = TRUE) %>% 
  cast_dfm(Movie, word, n)

Now let’s make a simple word cloud for all of the titles. Also, for movies released during the 1960s and 1980s

dtm %>% 
  dfm_remove(c('>', '<', 'one', 'ed', 'however', 'goes')) %>%
  textplot_wordcloud(color = "darkred", ordered_color = T) 

dtm %>% 
  dfm_remove(c('>', '<', 'one', 'ed', 'however', 'goes')) %>%
  textstat_frequency(n = 50) 
##       feature frequency rank docfreq group
## 1         new        22    1      18   all
## 2        town        21    2      14   all
## 3      people        20    3      17   all
## 4        find        17    4      16   all
## 5       house        17    4      10   all
## 6         two        16    6      13   all
## 7        back        15    7      14   all
## 8      family        15    7      11   all
## 9       years        14    9      13   all
## 10      night        13   10       9   all
## 11       soon        13   10      12   all
## 12     killer        13   10      12   all
## 13       dead        12   13      10   all
## 14       camp        12   13       7   all
## 15      group        12   13      10   all
## 16     police        11   16      10   all
## 17      young        11   16      11   all
## 18       life        11   16      10   all
## 19      human        11   16      10   all
## 20         dr        10   20       9   all
## 21       take        10   20      10   all
## 22     escape        10   20       9   all
## 23 mysterious        10   20       9   all
## 24       time         9   24       9   all
## 25    friends         9   24       8   all
## 26     killed         9   24       6   all
## 27      local         9   24       9   all
## 28        now         9   24       9   all
## 29      later         9   24       9   all
## 30       must         9   24       9   all
## 31       body         9   24       9   all
## 32      death         8   32       7   all
## 33   students         8   32       8   all
## 34       evil         8   32       5   all
## 35      takes         8   32       8   all
## 36      jason         8   32       3   all
## 37       lake         8   32       3   all
## 38    zombies         8   32       6   all
## 39     around         8   32       8   all
## 40       base         8   32       4   all
## 41      begin         8   32       8   all
## 42       also         8   32       8   all
## 43    colonel         8   32       2   all
## 44      party         8   32       6   all
## 45     father         8   32       6   all
## 46      blood         8   32       7   all
#Since most of the above average rated movies were between the 1960's and the 1990's I decided to make a wordcloud for this era

dtm2 %>%   
  dfm_remove(c('>', '<', 'one', 'ed', 'humberto', 'amador', 'norman', 'also')) %>%
  textplot_wordcloud() 

The focus will now be on the top 10 rated movies in the data set. I want to explore what words carry the most weight in their synopsis This remainder of this analysis was guided/inspired by Julie Siege’s topic modeling tutorials.

If you would like to see more of Julie Silge’s work you can take a look at her blog here: https://juliasilge.com/blog/ She is a great data scientist and prevelant figure in the R community!

tidyHoror %>% 
  filter(Movie %in% topHorrorList2) %>% 
  count(Movie, word, sort = TRUE) %>% 
  bind_tf_idf(word, Movie, n) %>% 
  group_by(Movie) %>% 
  slice_max(word, n = 3) %>% 
  ungroup() %>% 
  mutate(word = reorder(word, tf_idf)) %>% 
  ggplot(aes(word, tf_idf, fill = Movie)) + 
  geom_col(show.legend = FALSE) + 
  facet_wrap(~Movie, scales = "free") + 
  coord_flip()

library(stm)
## stm v1.3.7 successfully loaded. See ?stm for help. 
##  Papers, resources, and other materials at structuraltopicmodel.com
tidyHorror_dfm <- tidyHoror %>% 
  count(Movie, word, sort = TRUE) %>% 
  filter(word !="10") %>% 
  filter(word !="11944") %>% 
  filter(word != "_tremors_") %>% 
  cast_dfm(Movie, word, n)

topic_model <- stm(tidyHorror_dfm, K = 6, init.type = "Spectral")
## Beginning Spectral Initialization 
##   Calculating the gram matrix...
##   Finding anchor words...
##      ......
##   Recovering initialization...
##      ......................
## Initialization complete.
## ....................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 1 (approx. per word bound = -7.264) 
## ....................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 2 (approx. per word bound = -6.600, relative change = 9.140e-02) 
## ....................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 3 (approx. per word bound = -6.430, relative change = 2.581e-02) 
## ....................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 4 (approx. per word bound = -6.395, relative change = 5.439e-03) 
## ....................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 5 (approx. per word bound = -6.390, relative change = 8.146e-04) 
## Topic 1: town, family, creature, hunt, house 
##  Topic 2: accident, students, dead, jason, dr 
##  Topic 3: house, life, night, police, ed 
##  Topic 4: killer, camp, killed, people, start 
##  Topic 5: colonel, julie, base, secret, human 
##  Topic 6: josh, blood, missing, todd, mike 
## ....................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 6 (approx. per word bound = -6.384, relative change = 8.975e-04) 
## ....................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 7 (approx. per word bound = -6.381, relative change = 4.600e-04) 
## ....................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 8 (approx. per word bound = -6.379, relative change = 2.916e-04) 
## ....................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Completing Iteration 9 (approx. per word bound = -6.379, relative change = 5.422e-05) 
## ....................................................................................................
## Completed E-Step (0 seconds). 
## Completed M-Step. 
## Model Converged
summary(topic_model)
## A topic model with 6 topics, 100 documents and a 2230 word dictionary.
## Topic 1 Top Words:
##       Highest Prob: town, creature, family, house, home, christmas, escape 
##       FREX: creature, christmas, home, town, figure, shock, torgo 
##       Lift: shock, torgo, tourists, 2, 2010, apparently, attic 
##       Score: legendary, creature, christmas, torgo, tourists, shock, mountain 
## Topic 2 Top Words:
##       Highest Prob: accident, students, dead, jason, dr, nancy, doctor 
##       FREX: holden, horrible, jimmy, karswell, nest, prom, spiders 
##       Lift: 18th, 1961, 23, 9, acula, affair, aided 
##       Score: frankenstein, accident, nancy, holden, karswell, horrible, jimmy 
## Topic 3 Top Words:
##       Highest Prob: house, life, night, police, ed, begin, mary 
##       FREX: bukowski, barbra, criminals, days, gang, phyllis, tom 
##       Lift: accept, accrues, addicted, advantage, affected, agatha, agatha's 
##       Score: musicians, house, bukowski, mary, criminals, phyllis, tom 
## Topic 4 Top Words:
##       Highest Prob: killer, camp, killed, people, start, lake, maze 
##       FREX: maze, dallas, nico, alice, crystal, beth, rogue 
##       Lift: game, jan, l.a, maze, murderer, play, 1907 
##       Score: 1907, maze, crystal, start, killer, dallas, nico 
## Topic 5 Top Words:
##       Highest Prob: colonel, julie, base, secret, human, reynolds, people 
##       FREX: julie, reynolds, curt, skin, causing, control, project 
##       Lift: 11th, 1981, abound, acid, actress, adams, advertising 
##       Score: all's, julie, reynolds, colonel, curt, skin, base 
## Topic 6 Top Words:
##       Highest Prob: josh, blood, missing, mike, todd, local, escaped 
##       FREX: josh, todd, mike, escaped, blood, missing, terry 
##       Lift: terry, toxic, attractive, blamed, bloody, boys, brothers 
##       Score: descend, todd, josh, terry, toxic, escaped, mike
td_beta <- tidy(topic_model)

td_beta %>% 
  group_by(topic) %>% 
  slice_max(beta, n = 10) %>% 
  ungroup() %>% 
  mutate(term = reorder(term, beta)) %>% 
  ggplot(aes(term, beta, fill = term)) + 
  geom_col(show.legend = FALSE) + 
  facet_wrap(~topic, scales = "free") + 
  coord_flip()

td_gamma <- tidy(topic_model, matrix = "gamma", 
                 document_names = rownames(tidyHorror_dfm))

td_gamma %>% 
  ggplot(aes(gamma, fill = as.factor(topic))) + 
           geom_histogram(show.legend = FALSE) + 
           facet_wrap(~topic, ncol = 3)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.